Multiple linear regression
Before begin..
Let’s load the SBP dataset.
dataset_sbp <- read.csv(file = "Inha/5_Lectures/2024/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")
head(dataset_sbp)
## SEX BTH_G SBP DBP FBS DIS BMI
## 1 1 1 116 78 94 4 16.6
## 2 1 1 100 60 79 4 22.3
## 3 1 1 100 60 87 4 21.9
## 4 1 1 111 70 72 4 20.2
## 5 1 1 120 80 98 4 20.0
## 6 1 1 115 79 95 4 23.1
Making a new variable hypertension
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
dataset_sbp$DBP > 80,
T,
F)
dataset_sbp$history_of_hypertension <- ifelse(dataset_sbp$DIS == 1 |
dataset_sbp$DIS == 2,
T,
F)
set.seed(1)
dataset_sbp_small <- subset(dataset_sbp, row.names(dataset_sbp) %in% sample(x = 1:1000000, size = 1000))
dataset_sbp_small$DIS <- as.factor(dataset_sbp_small$DIS)
Simple linear regression
With lm() function, we can calculate the linear model of
DBP~SBP with the least squares algorithm.
#A function that returns equation
lm_eqn <- function(m){
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
ggplot(data = dataset_sbp_small, aes(y = SBP, x = DBP)) +
geom_point() +
theme_classic(base_family = "serif", base_size = 20) +
ylab("SBP (mmHg)") +
xlab("DBP (mmHg)") +
geom_smooth(method = "lm") +
geom_label(y = 170, x = 68,
label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ DBP)),
parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue")
lm(data = dataset_sbp_small, SBP ~ DBP) %>% confint()
## 2.5 % 97.5 %
## (Intercept) 32.20308 41.168723
## DBP 1.05807 1.175343
#rnorm(100, mean = 100, sd = 1)
Multiple regression
Basic concept
Deviding samples
In fact, there are two SEXs in the sample.
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
#geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
#geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point(aes(col = history_of_hypertension)) +
ylab("SBP (mmHg)") +
xlab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
theme(legend.position = "top") +
guides(col=guide_legend(title = "History of hyper tension"))
Deviding samples
In fact, there are two SEXs in the sample.
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
#geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
#geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point(aes(col = history_of_hypertension)) +
ylab("SBP (mmHg)") +
xlab("DBP (mmHg)") +
facet_wrap(~history_of_hypertension) +
theme_classic( base_family = "serif", base_size = 20) +
theme(legend.position = "top",
plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
guides(col=guide_legend(title = "History of hyper tension")) +
geom_abline(intercept = 37, slope = 1.1, col = "blue") +
ggtitle("y = 37 + 1.1 x")
# Multiple linear regression
lm(data = dataset_sbp_small, SBP ~ DBP) %>% summary
##
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.723 -6.022 -1.180 5.145 46.312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.68590 2.28442 16.06 <2e-16 ***
## DBP 1.11671 0.02988 37.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.487 on 998 degrees of freedom
## Multiple R-squared: 0.5832, Adjusted R-squared: 0.5828
## F-statistic: 1397 on 1 and 998 DF, p-value: < 2.2e-16
lm(data = dataset_sbp_small, SBP ~ DBP + history_of_hypertension) %>% summary
##
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.590 -4.500 -0.984 5.702 40.255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.40433 2.23748 18.058 <2e-16 ***
## DBP 1.04867 0.02974 35.256 <2e-16 ***
## history_of_hypertensionTRUE 6.42067 0.71630 8.964 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared: 0.6143, Adjusted R-squared: 0.6135
## F-statistic: 794 on 2 and 997 DF, p-value: < 2.2e-16
palette()
## [1] "black" "#DF536B" "#61D04F" "#2297E6" "#28E2E5" "#CD0BBC" "#F5C710"
## [8] "gray62"
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
#geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
#geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point(aes(col = history_of_hypertension)) +
ylab("SBP (mmHg)") +
xlab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
theme(legend.position = "top") +
guides(col=guide_legend(title = "History of hyper tension")) +
geom_abline(intercept = 40, slope = 1.04, col = "#F8766D") +
annotate(geom="text", x=120, y=140, label="LM of HT-False",
color="#F8766D", family = "serif", size = 8) +
geom_abline(intercept = 46.42, slope = 1.04, col = "#00BFC4") +
annotate(geom="text", x=100, y=170, label="LM of HT-True",
color="#00BFC4", family = "serif", size = 8)
Continuous variable as covariate
Deviding samples
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
#geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point() +
ylab("SBP (mmHg)") +
xlab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
theme(legend.position = "top",
plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
guides(col=guide_legend(title = "History of hyper tension")) +
geom_label(y = 170, x = 68,
label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ DBP)),
parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue")
ggplot(dataset_sbp_small, aes(y = SBP, x = BTH_G)) +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
#geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point() +
ylab("SBP (mmHg)") +
xlab("Age group") +
theme_classic( base_family = "serif", base_size = 20) +
theme(legend.position = "top",
plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
guides(col=guide_legend(title = "History of hyper tension")) +
geom_label(y = 170, x = 18,
label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ BTH_G)),
parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue")
lm(data = dataset_sbp, formula = SBP ~ DBP)
##
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp)
##
## Coefficients:
## (Intercept) DBP
## 38.144 1.105
model_result <- lm(data = dataset_sbp, formula = SBP ~ DBP)
dataset_sbp$DIS <- as.factor(dataset_sbp$DIS)
dataset_sbp %>% summary
## SEX BTH_G SBP DBP
## Min. :1.00 Min. : 1.00 Min. : 82.0 Min. : 50.00
## 1st Qu.:1.00 1st Qu.: 9.00 1st Qu.:110.0 1st Qu.: 70.00
## Median :1.00 Median :14.00 Median :120.0 Median : 76.00
## Mean :1.49 Mean :13.91 Mean :121.9 Mean : 75.79
## 3rd Qu.:2.00 3rd Qu.:19.00 3rd Qu.:130.0 3rd Qu.: 80.00
## Max. :2.00 Max. :27.00 Max. :190.0 Max. :120.00
## FBS DIS BMI hypertension
## Min. : 60.00 1: 53398 Min. :14.8 Mode :logical
## 1st Qu.: 87.00 2:162826 1st Qu.:21.5 FALSE:683197
## Median : 94.00 3: 43114 Median :23.6 TRUE :316803
## Mean : 98.86 4:740662 Mean :23.8
## 3rd Qu.:104.00 3rd Qu.:25.8
## Max. :358.00 Max. :40.3
## history_of_hypertension
## Mode :logical
## FALSE:783776
## TRUE :216224
##
##
##
dataset_sbp %>%
lm(formula = SBP ~ DBP + DIS + DBP * DIS) %>%
summary
##
## Call:
## lm(formula = SBP ~ DBP + DIS + DBP * DIS, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.292 -4.998 -0.600 5.474 89.299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.478249 0.325920 170.220 < 2e-16 ***
## DBP 0.958139 0.004126 232.202 < 2e-16 ***
## DIS2 -4.460697 0.375896 -11.867 < 2e-16 ***
## DIS3 -7.903914 0.499333 -15.829 < 2e-16 ***
## DIS4 -16.772084 0.337319 -49.722 < 2e-16 ***
## DBP:DIS2 0.035531 0.004735 7.504 6.17e-14 ***
## DBP:DIS3 0.042764 0.006454 6.626 3.45e-11 ***
## DBP:DIS4 0.120510 0.004285 28.124 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.415 on 999992 degrees of freedom
## Multiple R-squared: 0.5819, Adjusted R-squared: 0.5819
## F-statistic: 1.989e+05 on 7 and 999992 DF, p-value: < 2.2e-16
dataset_sbp_small %>%
lm(SBP ~ DBP + BTH_G, data = .) %>%
summary
##
## Call:
## lm(formula = SBP ~ DBP + BTH_G, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.618 -5.813 -0.839 5.478 44.670
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.41804 2.18271 15.77 <2e-16 ***
## DBP 1.06715 0.02881 37.05 <2e-16 ***
## BTH_G 0.43020 0.04153 10.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.019 on 997 degrees of freedom
## Multiple R-squared: 0.6237, Adjusted R-squared: 0.623
## F-statistic: 826.4 on 2 and 997 DF, p-value: < 2.2e-16
dataset_sbp_small %>%
lm(SBP ~ DBP + SEX, data = .) %>%
summary
##
## Call:
## lm(formula = SBP ~ DBP + SEX, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.063 -5.970 -1.219 5.211 45.634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.03083 2.63553 14.809 <2e-16 ***
## DBP 1.10698 0.03035 36.480 <2e-16 ***
## SEX -1.08426 0.60970 -1.778 0.0757 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.477 on 997 degrees of freedom
## Multiple R-squared: 0.5846, Adjusted R-squared: 0.5837
## F-statistic: 701.4 on 2 and 997 DF, p-value: < 2.2e-16
car::avPlots(dataset_sbp_small %>%
lm(SBP ~ DBP + history_of_hypertension, data = .))
car::avPlots(dataset_sbp_small %>%
lm(SBP ~ DBP + history_of_hypertension, data = .))
dataset_sbp_small %>%
lm(SBP ~ SEX, data = .) %>%
summary
##
## Call:
## lm(formula = SBP ~ SEX, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.779 -8.779 0.221 10.221 55.221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 128.8716 1.4333 89.91 < 2e-16 ***
## SEX -5.0921 0.9159 -5.56 3.47e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.47 on 998 degrees of freedom
## Multiple R-squared: 0.03004, Adjusted R-squared: 0.02907
## F-statistic: 30.91 on 1 and 998 DF, p-value: 3.47e-08
dataset_sbp_small %>%
lm(SBP ~ history_of_hypertension, data = .) %>%
summary
##
## Call:
## lm(formula = SBP ~ history_of_hypertension, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.304 -8.438 0.562 9.913 51.562
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 118.4381 0.4911 241.2 <2e-16 ***
## history_of_hypertensionTRUE 12.8654 1.0376 12.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.68 on 998 degrees of freedom
## Multiple R-squared: 0.1335, Adjusted R-squared: 0.1326
## F-statistic: 153.7 on 1 and 998 DF, p-value: < 2.2e-16
dataset_sbp_small %>%
lm(SBP ~ DBP + history_of_hypertension, data = .) %>%
summary
##
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.590 -4.500 -0.984 5.702 40.255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.40433 2.23748 18.058 <2e-16 ***
## DBP 1.04867 0.02974 35.256 <2e-16 ***
## history_of_hypertensionTRUE 6.42067 0.71630 8.964 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared: 0.6143, Adjusted R-squared: 0.6135
## F-statistic: 794 on 2 and 997 DF, p-value: < 2.2e-16
dataset_sbp_small %>%
lm(SBP ~ DBP + history_of_hypertension, data = .) %>%
car::avPlots()
dataset_sbp_small %>%
ggplot(., aes(y = SBP, x = history_of_hypertension)) +
geom_jitter()
Multiple linear regression
lm(data = dataset_sbp_small, SBP ~ DBP) %>% summary
##
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.723 -6.022 -1.180 5.145 46.312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.68590 2.28442 16.06 <2e-16 ***
## DBP 1.11671 0.02988 37.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.487 on 998 degrees of freedom
## Multiple R-squared: 0.5832, Adjusted R-squared: 0.5828
## F-statistic: 1397 on 1 and 998 DF, p-value: < 2.2e-16
lm(data = dataset_sbp_small, SBP ~ DBP + history_of_hypertension) %>% summary
##
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.590 -4.500 -0.984 5.702 40.255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.40433 2.23748 18.058 <2e-16 ***
## DBP 1.04867 0.02974 35.256 <2e-16 ***
## history_of_hypertensionTRUE 6.42067 0.71630 8.964 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared: 0.6143, Adjusted R-squared: 0.6135
## F-statistic: 794 on 2 and 997 DF, p-value: < 2.2e-16
palette()
## [1] "black" "#DF536B" "#61D04F" "#2297E6" "#28E2E5" "#CD0BBC" "#F5C710"
## [8] "gray62"
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
#geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
#geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point(aes(col = history_of_hypertension)) +
ylab("SBP (mmHg)") +
xlab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
theme(legend.position = "top") +
guides(col=guide_legend(title = "History of hyper tension")) +
geom_abline(intercept = 40, slope = 1.04, col = "#F8766D") +
annotate(geom="text", x=120, y=140, label="LM of HT-False",
color="#F8766D", family = "serif", size = 8) +
geom_abline(intercept = 46.42, slope = 1.04, col = "#00BFC4") +
annotate(geom="text", x=100, y=170, label="LM of HT-True",
color="#00BFC4", family = "serif", size = 8)
Simulation
Bibliography
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.